setwd("/mnt/picea/projects/aspseq/jfelten/T89-Laccaria-bicolor")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/UPSCb/projects/T89-Laccaria-bicolor/doc/Samples.csv") %>%
mutate(Time=factor(Time)) %>%
mutate(Experiment=factor(Experiment))
## Parsed with column specification:
## cols(
## SciLifeID = col_character(),
## SampleName = col_double(),
## Time = col_double(),
## Experiment = col_character(),
## Replicate = col_character()
## )
counts <- suppressWarnings(read_csv("January/analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1") +
read_csv("analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "22.7% percent (5172) of 22822 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5172 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 269828 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 269828 rows containing non-finite values (stat_density).
dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Lacbi-all-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Lacbi-all-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively variable (40 to 240 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_101 | P11915_102 | P11915_103 | P11915_104 | P11915_105 | P11915_106 |
|---|---|---|---|---|---|
| 0.9527 | 2.379 | 2.111 | 1.024 | 1.194 | 1.415 |
| P11915_107 | P11915_112 | P11915_113 | P11915_114 | P11915_115 | P11915_116 |
|---|---|---|---|---|---|
| 1.443 | 1.303 | 0.6593 | 0.6641 | 1.116 | 1.105 |
| P11915_117 | P11915_118 | P11915_123 | P11915_124 | P11915_125 | P11915_126 |
|---|---|---|---|---|---|
| 1.4 | 1.119 | 0.9421 | 0.7283 | 1.344 | 0.5157 |
| P11915_127 | P11915_128 | P11915_129 | P11915_134 | P11915_135 | P11915_136 |
|---|---|---|---|---|---|
| 0.65 | 0.8961 | 0.9009 | 1.401 | 1.027 | 1 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_145 | P11915_146 |
|---|---|---|---|---|---|
| 0.4518 | 1.064 | 0.8489 | 0.9296 | 1.737 | 1.297 |
| P11915_147 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|
| 1.605 | 0.7587 | 0.7511 | 0.4468 | 0.9132 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked very well, the data is nearly homoscesdastic
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate clearly both by Experiment and Time in the first 2 components.
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]-1])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-1])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[2]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering remains the same even for the most variable genes
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes shows a separation by experiment and time point
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The quality of the data is good. The PCA shows that the samples cluster by experiment and time, however, the heatmap shows a clustering that correlates with the mapping rates, i.e. the mixed amount of reads originating from either organism. The final heatmap seem to indicate that this effect is neglectable albeit confounded.
counts <- suppressWarnings(read_csv("January/analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1") +
read_csv("analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "11.6% percent (3935) of 34043 genes are not expressed"
The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 3935 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 433688 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 433688 rows containing non-finite values (stat_density).
dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potra-all-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potra-all-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is highly variable (10 to 500 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_104 | P11915_105 | P11915_106 | P11915_107 | P11915_108 | P11915_109 |
|---|---|---|---|---|---|
| 0.09004 | 0.2524 | 0.1799 | 0.3481 | 2.86 | 2.21 |
| P11915_110 | P11915_111 | P11915_115 | P11915_116 | P11915_117 | P11915_118 |
|---|---|---|---|---|---|
| 2.183 | 1.316 | 0.1794 | 0.2533 | 0.1347 | 0.2124 |
| P11915_119 | P11915_120 | P11915_121 | P11915_122 | P11915_126 | P11915_127 |
|---|---|---|---|---|---|
| 2.263 | 2.94 | 2.474 | 2.13 | 1.101 | 0.2189 |
| P11915_128 | P11915_129 | P11915_130 | P11915_131 | P11915_132 | P11915_133 |
|---|---|---|---|---|---|
| 0.6912 | 0.5727 | 3.285 | 2.003 | 2.659 | 3.163 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_141 | P11915_142 |
|---|---|---|---|---|---|
| 1.186 | 0.5998 | 0.199 | 0.258 | 2.13 | 4.864 |
| P11915_143 | P11915_144 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|---|
| 3.768 | 1.938 | 1.634 | 0.5845 | 1.462 | 0.8478 |
| P11915_152 | P11915_153 | P11915_154 | P11915_155 |
|---|---|---|---|
| 2.605 | 4.047 | 5.025 | 2.015 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-3])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[3]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering shows a better clustering by time and experiment, even though there are still outliers
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes is very similar to the overall profile
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.
These results suggests that a few samples could be removed as being under-sequenced
We removed samples P11915_104, P11915_127 and P11915_139 as having too few counts compared to their replicates
rm.sel <- colnames(counts) %in% c("P11915_104","P11915_127","P11915_139")
pander(colSums(counts)[rm.sel])
| P11915_104 | P11915_127 | P11915_139 |
|---|---|---|
| 129865 | 327104 | 315206 |
boxplot(list(removed=colSums(counts)[rm.sel],kept=colSums(counts)[!rm.sel]))
counts <- counts[,!rm.sel]
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "11.6% percent (3951) of 34043 genes are not expressed"
The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 3951 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
p <- ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
ggplotly(p)
## Warning: Removed 386051 rows containing non-finite values (stat_density).
Color by Time
p <- ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
ggplotly(p)
## Warning: Removed 386051 rows containing non-finite values (stat_density).
dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potra-104-127-139-removed-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potra-104-127-139-removed-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is highly variable (15 to 420 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_105 | P11915_106 | P11915_107 | P11915_108 | P11915_109 | P11915_110 |
|---|---|---|---|---|---|
| 0.2151 | 0.1545 | 0.2963 | 2.469 | 1.901 | 1.886 |
| P11915_111 | P11915_115 | P11915_116 | P11915_117 | P11915_118 | P11915_119 |
|---|---|---|---|---|---|
| 1.125 | 0.1532 | 0.2175 | 0.1161 | 0.181 | 1.951 |
| P11915_120 | P11915_121 | P11915_122 | P11915_126 | P11915_128 | P11915_129 |
|---|---|---|---|---|---|
| 2.508 | 2.118 | 1.831 | 0.9529 | 0.5889 | 0.4883 |
| P11915_130 | P11915_131 | P11915_132 | P11915_133 | P11915_137 | P11915_138 |
|---|---|---|---|---|---|
| 2.857 | 1.722 | 2.283 | 2.76 | 1.02 | 0.5047 |
| P11915_140 | P11915_141 | P11915_142 | P11915_143 | P11915_144 | P11915_148 |
|---|---|---|---|---|---|
| 0.2169 | 1.848 | 4.2 | 3.243 | 1.689 | 1.42 |
| P11915_149 | P11915_150 | P11915_151 | P11915_152 | P11915_153 | P11915_154 |
|---|---|---|---|---|---|
| 0.5044 | 1.256 | 0.7182 | 2.255 | 3.538 | 4.329 |
| P11915_155 |
|---|
| 1.741 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-3])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[3]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is slightly enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering shows a perfect clustering by experiment
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
Plotting the same using the SciLife IDs instead
The 3 samples clustering together ECM 14, 28 and 28 (126, 148, 150) are those that are the most deeply sequenced. So there is a bias there. However, sample 137 (equally deeply sequenced) clusters with the other ECM 14 and 21 samples.
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = samples$SciLifeID[s.sel],
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes is very similar to the overall profile in the sense that it shows increased variation in the clustering pattern
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
Using the ECM data for P. tremula x P. tremuloides in comparisons to Cont is going to be sub-optimal. Results will be biased by the lack of sequencing depth especially at the early time points. There still seem to be informative signal in the data, but the results will be partial. Luckily, this will only affect the TPR, i.e. we will miss on some real effects, but the FDR will not be affected
counts <- suppressWarnings(read_csv("January/analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1") +
read_csv("analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv") %>%
column_to_rownames("X1"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "16.5% percent (6807) of 41270 genes are not expressed"
The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 6807 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 671471 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 671471 rows containing non-finite values (stat_density).
dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potri-all-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potri-all-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is highly variable (10 to 500 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_104 | P11915_105 | P11915_106 | P11915_107 | P11915_108 | P11915_109 |
|---|---|---|---|---|---|
| 0.0896 | 0.2524 | 0.1809 | 0.3476 | 2.87 | 2.213 |
| P11915_110 | P11915_111 | P11915_115 | P11915_116 | P11915_117 | P11915_118 |
|---|---|---|---|---|---|
| 2.183 | 1.325 | 0.1816 | 0.2536 | 0.1359 | 0.2119 |
| P11915_119 | P11915_120 | P11915_121 | P11915_122 | P11915_126 | P11915_127 |
|---|---|---|---|---|---|
| 2.245 | 2.969 | 2.478 | 2.144 | 1.106 | 0.2158 |
| P11915_128 | P11915_129 | P11915_130 | P11915_131 | P11915_132 | P11915_133 |
|---|---|---|---|---|---|
| 0.6965 | 0.575 | 3.251 | 2.015 | 2.668 | 3.139 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_141 | P11915_142 |
|---|---|---|---|---|---|
| 1.223 | 0.6112 | 0.1993 | 0.2585 | 2.099 | 4.847 |
| P11915_143 | P11915_144 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|---|
| 3.82 | 1.902 | 1.632 | 0.5881 | 1.465 | 0.8566 |
| P11915_152 | P11915_153 | P11915_154 | P11915_155 |
|---|---|---|---|
| 2.564 | 3.973 | 5.014 | 1.996 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-3])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used
Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[3]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering shows a better clustering by time and experiment, even though there are still outliers
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes is very similar to the overall profile
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.
These results suggests that a few samples could be removed as being under-sequenced
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.52.0 forcats_0.4.0
## [3] stringr_1.4.0 dplyr_0.8.3
## [5] purrr_0.3.2 readr_1.3.1
## [7] tidyr_0.8.3 tibble_2.1.3
## [9] tidyverse_1.2.1 scatterplot3d_0.3-41
## [11] RColorBrewer_1.1-2 plotly_4.9.0
## [13] pander_0.6.3 LSD_4.0-0
## [15] hyperSpec_0.99-20180627 ggplot2_3.2.1
## [17] lattice_0.20-38 gplots_3.0.1.1
## [19] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [21] DelayedArray_0.10.0 BiocParallel_1.18.1
## [23] matrixStats_0.54.0 Biobase_2.44.0
## [25] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0
## [27] IRanges_2.18.2 S4Vectors_0.22.0
## [29] BiocGenerics_0.30.0 data.table_1.12.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.13.1 XVector_0.24.0
## [4] base64enc_0.1-3 rstudioapi_0.10 hexbin_1.27.3
## [7] affyio_1.54.0 bit64_0.9-7 AnnotationDbi_1.46.1
## [10] lubridate_1.7.4 xml2_1.2.2 splines_3.6.1
## [13] geneplotter_1.62.0 knitr_1.24 zeallot_0.1.0
## [16] Formula_1.2-3 jsonlite_1.6 broom_0.5.2
## [19] annotate_1.62.0 cluster_2.1.0 shiny_1.3.2
## [22] BiocManager_1.30.4 compiler_3.6.1 httr_1.4.1
## [25] backports_1.1.4 assertthat_0.2.1 Matrix_1.2-17
## [28] lazyeval_0.2.2 limma_3.40.6 cli_1.1.0
## [31] later_0.8.0 acepack_1.4.1 htmltools_0.3.6
## [34] tools_3.6.1 affy_1.62.0 gtable_0.3.0
## [37] glue_1.3.1 GenomeInfoDbData_1.2.1 reshape2_1.4.3
## [40] Rcpp_1.0.2 cellranger_1.1.0 vctrs_0.2.0
## [43] preprocessCore_1.46.0 gdata_2.18.0 nlme_3.1-141
## [46] crosstalk_1.0.0 xfun_0.9 testthat_2.2.1
## [49] rvest_0.3.4 mime_0.7 gtools_3.8.1
## [52] XML_3.98-1.20 zlibbioc_1.30.0 scales_1.0.0
## [55] promises_1.0.1 hms_0.5.1 yaml_2.2.0
## [58] memoise_1.1.0 gridExtra_2.3 rpart_4.1-15
## [61] latticeExtra_0.6-28 stringi_1.4.3 RSQLite_2.1.2
## [64] highr_0.8 genefilter_1.66.0 checkmate_1.9.4
## [67] caTools_1.17.1.2 rlang_0.4.0 pkgconfig_2.0.2
## [70] bitops_1.0-6 evaluate_0.14 labeling_0.3
## [73] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [76] plyr_1.8.4 magrittr_1.5 R6_2.4.0
## [79] generics_0.0.2 Hmisc_4.2-0 DBI_1.0.0
## [82] pillar_1.4.2 haven_2.1.1 foreign_0.8-72
## [85] withr_2.1.2 survival_2.44-1.1 RCurl_1.95-4.12
## [88] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [91] KernSmooth_2.23-15 rmarkdown_1.15 locfit_1.5-9.1
## [94] readxl_1.3.1 blob_1.2.0 digest_0.6.20
## [97] xtable_1.8-4 httpuv_1.5.1 munsell_0.5.0
## [100] viridisLite_0.3.0